Setup

Install and Attach

installr provides require2, this will install a package if it is missing and library it. Unfortunetly, intall is a package too, so you cannot use require2 on it.

if(!require(installr))install.packages("installr")
library(installr)

## https://rstudio.github.io/distill/tables.html

require2(rmarkdown)
require2(kableExtra)
require2(tidyverse)
require2(glue)
require2(readr)
require2(plotly)
require2(readr)
require2(readxl)
require2(lubridate)
require2(curl)
require2(epidata)

Set up the Knitted table

This will automaticaly detect if the document is being knited and apply the provided table formatting function or rmarkdown::paged_table if not provided. If nhead or ntail it will call the head or tail function respectively and limit the data. On 0, it will ignore it. The default is to create a paginated table on overflow so all the data is accessable but does not take the entire screen.

disp=function(tbl, nhead=0, ntail=0, style=paged_table){
    if(!is.function(style))style=function(t){
        kbl(t)%>%
            style()%>%
            return()
    }
    ## If the code is kniting
    if(isTRUE(getOption('knitr.in.progress'))){
        if(nhead!=0)tbl=head(tbl, n=nhead)
        if(ntail!=0)tbl=tail(tbl, n=ntail)
        return(
            tbl%>%
        style()
        )
    }
    ## Otherwise just return the raw tible to be formated by RStudio
    return(tbl)
}
mtcars%>%disp()
mtcars%>%disp(nhead = 20)
mtcars%>%disp(ntail = 10)
mtcars%>%disp(style = function(t){
    kbl(t)%>%
            style()%>%
            return()
})

Import Data

Make sure we have internet and if not abort if not

if(!curl::has_internet())quit()

Import cpsaat Data

cpsaat data is provided online at bls.gov. As it is a direct link we can donload it and save it to a temporary file and process the data with readxl::read_excel()

## Create a temp file name/location
tmp <- tempfile()
## Download cpsaat data
curl_download("https://www.bls.gov/cps/cpsaat11.xlsx", destfile = tmp)

## Import cpsaat
cpsaat11 <- read_excel(
        tmp,
        col_names = c(
            "Occupation",
            "Total",
            "Women",
            "White",
            "Black/African American",
            "Asian",
            "Hispanic/Latino"
        ),
        na = "–",
        col_types = c(
            Occupation="text",
            Total="numeric",
            "Women"="numeric",
            "White"="numeric",
            "Black/African American"="numeric",
            "Asian"="numeric",
            "Hispanic/Latino"="numeric"
        ),
        skip = 7
    )%>%
    drop_na(Occupation)
## Remove temp file and var
file.remove(tmp)
## [1] TRUE
rm(tmp)

Import EPI Data

Get the data at EPI. As there is no direct link avalable we cannot use curl, instead there is a package that we can use to access the data, epidata. This will download data in the background.

Labor_force_participation <- epidata::get_labor_force_participation_rate(by = "gr")

Medianaverage_hourly_wages <- epidata::get_median_and_mean_wages(by = "gr")

Minimum_wage <- epidata::get_minimum_wage()

Clean Data

The data is in a terrible format for use in ggplot2, and we call this wide format as it has many columns. To fix this we can convert it into long format, as there are many rows, with pivot_longer.

Clean cpsaat11

cpsaat11%>%disp()
cpsaat11=cpsaat11%>%
    pivot_longer(-c(Occupation, Total), names_to = "Race", values_to = "Percentage")

Looks fine.

Clean Labor_force_participation

Labor_force_participation%>%disp()
Participation=Labor_force_participation%>%
    pivot_longer(-date, names_to = "Race", values_to = "Participation", values_drop_na = T)%>%
    separate(Race, into = c("Race", "Gender"))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 3012 rows [1, 2,
## 3, 4, 7, 10, 13, 14, 15, 16, 19, 22, 25, 26, 27, 28, 31, 34, 37, 38, ...].
Participation=Participation%>%
    filter(grepl("women|men", Race, ignore.case = T))%>%
    mutate(
        Gender=Race,
        Race=NA_character_
    )%>%
    union(
        Participation%>%
            filter(!grepl("women|men", Race, ignore.case = T))
    )
Participation%>%
    filter(!is.na(Race))
## # A tibble: 5,020 x 4
##    date       Race     Gender Participation
##    <date>     <chr>    <chr>          <dbl>
##  1 1978-12-01 all      <NA>           0.634
##  2 1978-12-01 black    <NA>           0.617
##  3 1978-12-01 black    women          0.535
##  4 1978-12-01 black    men            0.718
##  5 1978-12-01 hispanic <NA>           0.633
##  6 1978-12-01 hispanic women          0.47 
##  7 1978-12-01 hispanic men            0.812
##  8 1978-12-01 white    <NA>           0.635
##  9 1978-12-01 white    women          0.499
## 10 1978-12-01 white    men            0.785
## # ... with 5,010 more rows
rm(Labor_force_participation)

Clean Medianaverage_hourly_wages

Medianaverage_hourly_wages%>%disp()
Wages=Medianaverage_hourly_wages%>%
    pivot_longer(-date, names_to = "Race", values_to = "Wage", values_drop_na = T)%>%
    separate(Race, into = c("Race", "Gender", "Summary"), fill = "left")

## Race is in the wrong location sometimes
Wages=Wages%>%
    filter(!grepl("women|men", Gender, ignore.case = T))%>%
    mutate(
        Race=Gender,
        Gender=NA_character_
    )%>%
    union(
        Wages%>%
            filter(grepl("women|men", Gender, ignore.case = T))
    )
## No need to keep the Average and Median split up
Wages=Wages%>%
    pivot_wider(names_from = Summary, values_from = Wage)
rm(Medianaverage_hourly_wages)

Clean Minimum_wage

This data has data in terms of 2018, the other data is in 2019 USD. Although small, there will be a difference and we need to adjust for inflation. The package priceR allows us to convert those monitary values into other ones using online inflation data.

Minimum_wage%>%disp()
##adjust for inflation to get to common 2019
Minimum_wage=Minimum_wage%>%
    mutate(
        Min2019=priceR::adjust_for_inflation(
            federal_minimum_wage_real_x_2018_dollars,
            2018,
            "US",
            2019
        )
    )
## Retrieving countries data
## Generating URL to request all 297 results
## Retrieving inflation data for US 
## Generating URL to request all 61 results
Minimum_wage=Minimum_wage%>%
    rename(MinCur=federal_minimum_wage_nominal_dollars)%>%
    select(Min2019, MinCur, date)

Fix inconsistant case

As the data was imported with epidata, the colum names have been changed from what the csv has. So we need to fix that to conform to consistancy.

Wages=Wages%>%
    rename(
        Date=date,
        Median=median,
        Average=average
    )

Participation=Participation%>%
    rename(Date=date)

Minimum_wage=Minimum_wage%>%
    rename(Date=date)

Graph

Average and Medium Wage over Time by Race and Gender

g=Wages%>%
    ggplot(aes(col=Race, x=Date))+
    geom_line(aes(y=Average))+
    geom_line(aes(y=Min2019, col=NULL), data=Minimum_wage, size=2)+
    facet_wrap(~Gender)
ggplotly(g)
g=Wages%>%
    ggplot(aes(col=Race, x=Date))+
    geom_line(aes(y=Median))+
    geom_line(aes(y=Min2019, col=NULL), data=Minimum_wage, size=2)+
    facet_wrap(~Gender)
ggplotly(g)

Scatter Plot over Time

g=Wages%>%
    ggplot()+
    geom_point(aes(x=Median, y=Average, col=Race, shape=Gender, frame=Date))+
    ggtitle("Median vs Average Wage per Race and Gender over Time")
## Warning: Ignoring unknown aesthetics: frame
ggplotly(g)

Wages according to Jobs

Sumarise data according to income of jobs

cpsaat11%>%
    ggplot(aes(x=log(Total)))+
    geom_boxplot()
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

## Generate the percentiles
se=quantile(log(cpsaat11$Total), seq(0, 1, by=.1), na.rm=T)

## Add outlyers
se["200%"]=Inf

## break into groups and drop NAs
d=cpsaat11%>%
    drop_na(Percentage)%>%
    group_by(gr=cut(Total, breaks=exp(se)), Race)

## Sumarise the data and remove women as it is not a race
## This is so it alld up to 100% or so
d=d%>%
    summarise(Percentage=mean(Percentage), Total=mean(Total))%>%
    filter(Race!="Women")
## `summarise()` has grouped output by 'gr'. You can override using the `.groups` argument.
d
## # A tibble: 32 x 4
## # Groups:   gr [8]
##    gr       Race                   Percentage Total
##    <fct>    <chr>                       <dbl> <dbl>
##  1 (40,60]  Asian                        3.72  53.7
##  2 (40,60]  Black/African American      10.9   53.7
##  3 (40,60]  Hispanic/Latino             16.9   53.7
##  4 (40,60]  White                       82.1   53.7
##  5 (60,93]  Asian                        8.60  74.6
##  6 (60,93]  Black/African American      13.1   74.6
##  7 (60,93]  Hispanic/Latino             14.1   74.6
##  8 (60,93]  White                       74.6   74.6
##  9 (93,131] Asian                        5.88 110. 
## 10 (93,131] Black/African American      11.9  110. 
## # ... with 22 more rows
## Is ther missing data
cpsaat11%>%
    drop_na(Percentage)%>%
    filter(Total<30)
## # A tibble: 0 x 4
## # ... with 4 variables: Occupation <chr>, Total <dbl>, Race <chr>,
## #   Percentage <dbl>
g=d%>%
    ggplot(aes(fill=Race, y=Percentage, x=gr))+
    geom_col()
ggplotly(g)
g=d%>%
    ggplot(aes(fill=Race, y=Percentage*Total, x=gr))+
    geom_col(position = "dodge2")+
    scale_y_log10()
ggplotly(g)
g=d%>%
    ggplot(aes(fill=gr, x=1, y=Percentage))+
    geom_col(position = "dodge2")+
    facet_wrap(~Race)
ggplotly(g)
g=d%>%
    ggplot(aes(fill=gr, x=1, y=Percentage*Total))+
    geom_col(position = "dodge2")+
    facet_wrap(~Race)+
    scale_y_log10()
ggplotly(g)
g=d%>%
    ggplot(aes(fill=Race, x=1, y=Percentage*Total))+
    geom_col(position = "dodge2")+
    facet_wrap(~gr)
ggplotly(g)